import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
#Supress warnings
import warnings
warnings.filterwarnings('ignore')
# Load the full bank-marketing dataset; bank-full.csv must sit next to this notebook.
bank_df=pd.read_csv("bank-full.csv")
# Quick sanity checks on the imported frame.
bank_df.head()
#Imported shape is correct as per given data
bank_df.shape
#most of the features are of int and object type
bank_df.info()
#there are no blanks in data
bank_df.isnull().sum()
bank_df.dtypes
#All object type feature freq check: print value counts and draw one countplot
#per categorical feature. A new figure is created each iteration — without it
#every countplot lands on the same axes and overdraws the previous one.
bank_df_obj=pd.DataFrame(bank_df.select_dtypes(include='object'))
for col in bank_df_obj.columns:
    print("Feature Name: ", col)
    plt.figure()
    sns.countplot(bank_df[col])
    plt.show()
    print(bank_df_obj[col].value_counts(),"\n")
def _cat_summary(col, figsize=(5, 5), **plot_kwargs):
    """Print a counts/percent table for bank_df[col] and draw its countplot.

    col         -- column name in bank_df
    figsize     -- size of the new figure created for the plot
    plot_kwargs -- forwarded to sns.countplot (e.g. color=..., palette=...)
    """
    s = bank_df[col]
    counts = s.value_counts()
    percent = s.value_counts(normalize=True).mul(100).round(1)
    print(pd.DataFrame({'counts': counts, 'percent': percent}))
    plt.figure(figsize=figsize)
    sns.countplot(bank_df[col], **plot_kwargs)

#job analysis
_cat_summary('job', figsize=(15, 8), color='blue')
#marital analysis
_cat_summary('marital', palette='Set1')
#education analysis
_cat_summary('education', color='blue')
#default analysis - 'yes' percentage is very low; inspect further before
#deciding whether to keep this feature in the analysis
_cat_summary('default', color='blue')
#housing analysis
_cat_summary('housing', color='blue')
#loan analysis
_cat_summary('loan', color='blue')
#contact analysis
_cat_summary('contact', color='blue')
#day analysis (boxplot drawn on the same axes, as in the original exploration)
_cat_summary('day', figsize=(15, 5), color='blue')
sns.boxplot(bank_df['day'])
#month analysis - most respondents were contacted in May
_cat_summary('month', figsize=(12, 5), color='blue')
#poutcome analysis - mostly 'unknown'; check whether it is usable downstream
_cat_summary('poutcome', color='blue')
#Target analysis - subscribers are rare. The class imbalance is kept as-is
#because it reflects the real-world subscription rate for term deposits.
_cat_summary('Target', color='blue')
#Univariate look at the numeric columns: distribution, skewness, outliers.
bank_df.describe()

#Age: moderately skewed.
print(sns.distplot(bank_df.age))
print("Skewness is: ", bank_df['age'].skew())
sns.boxplot(y='age', data=bank_df)

#Balance: highly skewed with many outliers; candidate for mean +/- 3*std cleaning.
print(sns.distplot(bank_df.balance))
print("Skewness is: ", bank_df['balance'].skew())
sns.boxplot(y='balance', data=bank_df)

#Day: very low skewness but multimodal.
print(sns.distplot(bank_df.day))
print("Skewness is: ", bank_df['day'].skew())
sns.boxplot(y='day', data=bank_df)

#Duration: highly skewed with outliers. It is only known AFTER the call and
#fully determines the target in one direction (duration=0 => y='no'), so it
#leaks the label: keep it for benchmarking only and drop it from any realistic
#predictive model (it is dropped later, before modelling).
print(sns.distplot(bank_df.duration))
print("Skewness is: ", bank_df['duration'].skew())
sns.boxplot(y='duration', data=bank_df)

#Campaign: highly skewed with a long right tail. Many apparent outliers, but a
#high number of contacts per customer is plausible, so inspect before cleaning.
print(sns.distplot(bank_df.campaign))
print("Skewness is: ", bank_df['campaign'].skew())
sns.boxplot(y='campaign', data=bank_df)

#pdays: -1 dominates. Days cannot be -1; these are clients never contacted in
#a previous campaign.
print(sns.distplot(bank_df.pdays))
print("Skewness is: ", bank_df['pdays'].skew())
sns.boxplot(y='pdays', data=bank_df)

#previous: 0 marks the same never-contacted clients flagged as pdays == -1.
bank_df['previous'].value_counts()
#Fixed copy-paste slip: this section analyses 'previous', but the original
#boxplotted 'pdays' a second time.
sns.boxplot(y='previous', data=bank_df)
#Mean/median of every numeric variable split by the target:
#subscribers show higher mean/median balance and duration, higher mean pdays
#(days since last contact in the previous campaign) and previous (contacts
#before this campaign), and a LOWER mean campaign (contacts in this campaign).
bank_df.groupby(['Target']).agg(['mean','median'])

def _target_rate(col, figsize=(5, 5)):
    """Print per-category subscription percentages for bank_df[col]
    (row-normalized crosstab vs Target, sorted by 'yes' rate) and draw a
    Target-hued countplot on a fresh figure."""
    print(pd.crosstab(bank_df[col], bank_df['Target'], normalize='index')
          .mul(100).sort_values(by='yes', ascending=False))
    plt.figure(figsize=figsize)
    sns.countplot(x=col, hue='Target', data=bank_df)

#job - blue-collar was contacted most yet converts least; students convert
#most despite being contacted least.
_target_rate('job', figsize=(15, 5))
#marital - married were contacted most but convert least; singles convert most.
_target_rate('marital')
#education - tertiary education converts most.
_target_rate('education')
#default - subscribers among defaulters are very few; unlikely to add much.
_target_rate('default')
_target_rate('housing')
_target_rate('loan')
_target_rate('contact')
_target_rate('day', figsize=(15, 5))
#month - mar/dec/sep convert best despite having the fewest client contacts.
_target_rate('month', figsize=(15, 5))
#poutcome - those who subscribed in the previous campaign tend to subscribe
#again, but they are few in the data.
_target_rate('poutcome', figsize=(15, 5))

#campaigns with a lower number of contacts convert more often
sns.pairplot(bank_df, hue='Target')
#none of the numeric variables are strongly correlated with each other
bank_corr = bank_df.corr()
plt.subplots(figsize=(15, 10))
sns.heatmap(bank_corr, cmap="YlGnBu", annot=True)
## Binning the variables to overcome the skewness and to deal with outliers
#Binning Balance
def bal_group(series):
    """Map a single balance value onto a coarse category label.

    Buckets: (<0) negative, [0,500) low, [500,2000) moderate,
    [2000,10000) high, [10000,∞) very high. Non-comparable inputs
    (e.g. NaN) fall through and yield None, as before.
    """
    if series < 0:
        return "negative balance"
    # Scan ascending upper bounds; the first one the value is under wins.
    for upper, label in ((500, "low balance"),
                         (2000, "moderate balance"),
                         (10000, "high balance")):
        if series < upper:
            return label
    if series >= 10000:
        return "very high balance"
# Attach the balance bucket as a new column and inspect its distribution.
bank_df['bal_group'] = bank_df['balance'].apply(bal_group)
bank_df['bal_group'].value_counts()
# Raw counts of each balance bucket against the subscription target.
pd.crosstab(bank_df['bal_group'],bank_df['Target'])
#Binning Campaign
def camp_group(series):
    """Bucket the number of contacts made during this campaign.

    <=1 -> first contact; 2-5, 6-10 -> mid ranges; >10 -> many.
    Values in none of the ranges (e.g. NaN, 1.5) yield None, as before.
    """
    if series <= 1:
        return "1st Contact"
    for low, high, label in ((2, 5, "2-5 Contacts"),
                             (6, 10, "6-10 Contacts")):
        if low <= series <= high:
            return label
    if series > 10:
        return "More than 10 Contacts"
# Attach the campaign-contact bucket as a new column.
bank_df['camp_group'] = bank_df['campaign'].apply(camp_group)
#Most of people subscribe with lower number of contacts
bank_df['camp_group'].value_counts()
pd.crosstab(bank_df['camp_group'],bank_df['Target'])
#Binning Duration (the original header said "Campaign" by mistake)
def dura_group(series):
    """Bucket a call duration in seconds into coarse ranges.

    <=120s, (120,600]s, (600,1800]s, >1800s — roughly up to 2, 10 and
    30 minutes. NaN falls through every comparison and yields None,
    matching the original behavior. Label strings (including their
    original spelling) are preserved exactly.
    """
    # Inclusive upper bounds scanned in ascending order.
    for upper, label in ((120, "<2 mints Duration"),
                         (600, "<2-10 mints Duration"),
                         (1800, "<10-30 mints Contacts")):
        if series <= upper:
            return label
    if series > 1800:
        return "More than 30 mints Contacts"
# Attach the duration bucket as a new column and inspect it.
bank_df['dura_group'] = bank_df['duration'].apply(dura_group)
#
bank_df['dura_group'].value_counts()
pd.crosstab(bank_df['dura_group'],bank_df['Target'])
# Keep an untouched copy before the destructive feature engineering below.
bank_df_1=bank_df.copy()
# balance/campaign are replaced by their binned versions created above.
bank_df.drop(['balance','campaign'],axis=1,inplace=True)
bank_df.columns
#Converting day to categorical
# NOTE(review): 'day' becomes category dtype, so select_dtypes(include='object')
# below will NOT pick it up for one-hot encoding, and it is dropped entirely at
# the end of this section — confirm that excluding 'day' was intentional.
bank_df['day']=bank_df['day'].astype('category')
bank_df['Target']=bank_df['Target'].astype('category')
bank_df.dtypes
# creating a dict file
# Binary yes/no columns are label-encoded to 1/0.
label = {'yes': 1,'no': 0}
bank_df.loan = [label[item] for item in bank_df.loan]
bank_df.default = [label[item] for item in bank_df.default]
bank_df.Target = [label[item] for item in bank_df.Target]
bank_df.head()
# One-hot encode the remaining object-dtype columns and append the dummies.
bank_df_obj=pd.DataFrame(bank_df.select_dtypes(include='object'))
bank_df_obj=pd.get_dummies(bank_df_obj)
bank_df_2 = pd.concat([bank_df,bank_df_obj], axis=1)
print(bank_df_2.columns)
# Drop the raw categorical columns now represented by dummies, plus the
# leaky 'duration' feature and the intermediate bin columns.
# NOTE(review): 'loan' and 'default' were just label-encoded above but are
# dropped here, so they never reach the model — confirm this is intended.
bank_df_3=bank_df_2.drop(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'day',
'month','bal_group', 'camp_group','duration','poutcome','dura_group'],axis=1)
bank_df_3.info()
bank_df_3.shape
#Test Train Split
X=bank_df_3.drop(['Target'],axis=1)
y=bank_df_3['Target']
#Test Train Split
from sklearn.model_selection import train_test_split
# 70/30 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=100)
# Baseline decision tree with default (unbounded) depth.
from sklearn.tree import DecisionTreeClassifier
dt_algo = DecisionTreeClassifier(random_state=101)
dt_algo.fit(X_train, y_train)
y_pred = dt_algo.predict(X_test)
# Train vs test accuracy — a large gap indicates overfitting.
print(dt_algo.score(X_train, y_train))
print(dt_algo.score(X_test , y_test))
#the model overfits, its accuracy on test data is good. However recall and precision are low. We want model
#to predict better so as we don't waste time in handling customers which are not likely to subscribe
# metrics
from sklearn import metrics
#confusion matrix
# NOTE: this rebinds the name `confusion_matrix` to an array; the function is
# still reachable as metrics.confusion_matrix.
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
print("confusion matrix: \n",confusion_matrix)
# accuracy
print("accuracy", metrics.accuracy_score(y_test, y_pred))
# precision
print("precision", metrics.precision_score(y_test, y_pred))
# recall/sensitivity
print("recall", metrics.recall_score(y_test, y_pred))
#Area under curve
# AUC computed from hard 0/1 predictions (not probabilities), so it equals
# balanced accuracy here; the ROC curve below uses predict_proba instead.
print("area-under-curve metric: ", metrics.roc_auc_score(y_test, y_pred))
#ROC Curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
dt_roc_auc = roc_auc_score(y_test, dt_algo.predict(X_test))
fpr, tpr, thresholds = roc_curve(y_test, dt_algo.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Decision Tree (area = %0.2f)' % dt_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()
#Regularization using GridSearchCV
dt_algo1 = DecisionTreeClassifier(random_state = 102)
# Search space for pruning the overfit baseline tree.
params = {"max_depth": np.arange(8, 20),"max_features":np.arange(15,55,5),'min_samples_leaf': range(45, 65, 5),
'min_samples_split': range(2,5),
'criterion': ["entropy", "gini"]}
from sklearn.model_selection import GridSearchCV
model_cv = GridSearchCV(estimator = dt_algo1, param_grid = params,
scoring= 'accuracy',
cv=3,
verbose = 1,
return_train_score=True)
model_cv.fit(X_train, y_train)
# results of grid search CV
cv_results = pd.DataFrame(model_cv.cv_results_)
#cv_results
#parameters best value
best_score = model_cv.best_score_
best = model_cv.best_params_
best
#using best parameter values
# Hyper-parameters below are hard-coded from the grid-search result printed above.
dt_algo_best = DecisionTreeClassifier(max_depth= 14, max_features= 40,random_state=103,min_samples_leaf=50,
min_samples_split=2,criterion='entropy')
dt_algo_best.fit(X_train, y_train)
# predict
y_pred1 = dt_algo_best.predict(X_test)
#accuracy and precision increase but recall decreases though by less amount and also roc_auc metric
#decreases by .04.
# metrics
# confusion matrix
print("confusion matrix: \n", metrics.confusion_matrix(y_test, y_pred1))
# accuracy
print("accuracy: ", metrics.accuracy_score(y_test, y_pred1))
# precision
print("precision: ", metrics.precision_score(y_test, y_pred1))
# recall/sensitivity
print("recall: ", metrics.recall_score(y_test, y_pred1))
#Area under curve
print("area-under-curve metric: ", metrics.roc_auc_score(y_test, y_pred1))
#ROC Curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
dt_roc_auc1 = roc_auc_score(y_test, dt_algo_best.predict(X_test))
fpr, tpr, thresholds = roc_curve(y_test, dt_algo_best.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Decision Tree (area = %0.2f)' % dt_roc_auc1)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('ROC_DecisionTree')
plt.show()
#Feature Importance
# Top 15 features ranked by the tuned tree's impurity-based importances.
dt_imp_feature=pd.DataFrame(dt_algo_best.feature_importances_, columns = ["Imp"], index = X_train.columns)
dt_imp_feature.sort_values(by="Imp",ascending=False)[:15]
#dt_imp_feature.sort_values(by="Imp",ascending=False)
import os
#Make the Graphviz binaries reachable for pydotplus (Windows install path).
os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/graphviz-2.38/release/bin/'
#Tree structure: export the tuned tree to DOT, render it to PNG and display it.
from sklearn import tree
from sklearn.tree import export_graphviz
#sklearn.externals.six was deprecated and removed in scikit-learn 0.23+;
#io.StringIO is the drop-in replacement for the in-memory DOT buffer.
from io import StringIO
from IPython.display import Image
import pydotplus
import graphviz
features = X_train.columns
dot_data = StringIO()
export_graphviz(dt_algo_best, out_file=dot_data, feature_names=features, filled=True, rounded=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('tree.png')
Image(graph.create_png())
# Baseline bagging ensemble (default base estimator: decision tree).
from sklearn.ensemble import BaggingClassifier
bg_bank = BaggingClassifier(random_state=150)
bg_bank.fit(X_train, y_train)
y_pred_bg = bg_bank.predict(X_test)
# Train vs test accuracy — gap indicates overfitting.
print(bg_bank.score(X_train, y_train))
print(bg_bank.score(X_test , y_test))
#this algo seems overfit, also its recall is low
#metrices
#confusion matrix
confusion_matrix = metrics.confusion_matrix(y_test, y_pred_bg)
print("confusion matrix: \n",confusion_matrix)
# accuracy
print("accuracy", metrics.accuracy_score(y_test, y_pred_bg))
# precision
print("precision", metrics.precision_score(y_test, y_pred_bg))
# recall/sensitivity
print("recall", metrics.recall_score(y_test, y_pred_bg))
#Area under curve
# AUC from hard predictions (not probabilities).
print("area-under-curve metric: ", metrics.roc_auc_score(y_test, y_pred_bg))
#Regularization using GridSearchCV
bg_bank1 = BaggingClassifier(random_state=151)
#oob_score must be a boolean; the original passed the STRING 'True', which
#only behaved like True by accident of truthiness.
params = {"n_estimators": np.arange(30,50,2),"max_features":[0.78,0.8,0.82,0.84],
'max_samples': [0.45,0.5,0.55,0.6],'oob_score':[True]}
model_cv_bg = GridSearchCV(estimator = bg_bank1, param_grid = params,
scoring= 'accuracy',
cv=3,
verbose = 1,
return_train_score=True)
model_cv_bg.fit(X_train, y_train)
# results of grid search CV
cv_results_bg = pd.DataFrame(model_cv_bg.cv_results_)
#cv_results_bg
#parameters best value
best_score_bg = model_cv_bg.best_score_
best_bg = model_cv_bg.best_params_
best_bg
#NOTE(review): max_samples=0.8 lies outside the searched grid [0.45-0.6] —
#confirm this is an intentional manual override of the CV result.
bg_algo_best = BaggingClassifier(max_features= 0.8, max_samples=0.8,n_estimators=42,oob_score=True,random_state=152)
bg_algo_best.fit(X_train, y_train)
# predict
y_pred1_bg = bg_algo_best.predict(X_test)
#Accuracy improves and so does precision, but recall drops .
# metrics
# confusion matrix
print("confusion matrix: \n", metrics.confusion_matrix(y_test, y_pred1_bg))
# accuracy
print("accuracy: ", metrics.accuracy_score(y_test, y_pred1_bg))
# precision
print("precision: ", metrics.precision_score(y_test, y_pred1_bg))
# recall/sensitivity
print("recall: ", metrics.recall_score(y_test, y_pred1_bg))
#Area under curve
print("area-under-curve metric: ", metrics.roc_auc_score(y_test, y_pred1_bg))
#ROC
bg_roc_auc = roc_auc_score(y_test, bg_algo_best.predict(X_test))
fpr, tpr, thresholds = roc_curve(y_test, bg_algo_best.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Bagging (area = %0.2f)' % bg_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('ROC_Bagging')
plt.show()
#Feature Importance, averaged over the ensemble's base trees.
#Loop variable renamed from `tree` to `est` so it no longer shadows the
#sklearn `tree` module imported earlier in the file.
feature_importances = np.mean([est.feature_importances_ for est in bg_algo_best.estimators_], axis=0)
bg_imp_feature=pd.DataFrame(feature_importances, columns = ["Imp"])
bg_imp_feature.sort_values(by="Imp",ascending=False)
#RandomForest Algo
from sklearn.ensemble import RandomForestClassifier
rf_bank = RandomForestClassifier(random_state=200)
rf_bank.fit(X_train, y_train)
y_pred_rf = rf_bank.predict(X_test)
# Train vs test accuracy — gap indicates overfitting.
print(rf_bank.score(X_train, y_train))
print(rf_bank.score(X_test , y_test))
#confusion matrix
confusion_matrix = metrics.confusion_matrix(y_test, y_pred_rf)
print("confusion matrix: \n",confusion_matrix)
# accuracy
print("accuracy", metrics.accuracy_score(y_test, y_pred_rf))
# precision
print("precision", metrics.precision_score(y_test, y_pred_rf))
# recall/sensitivity
print("recall", metrics.recall_score(y_test, y_pred_rf))
#Area under curve
print("area-under-curve metric: ", metrics.roc_auc_score(y_test, y_pred_rf))
#Regularization using GridSearchCV
#oob_score must be a boolean; the original passed the STRING "True".
rf_bank1 = RandomForestClassifier(random_state = 201,oob_score=True,bootstrap=True)
params = {"n_estimators": np.arange(12,18,2),'criterion': ["entropy"],"max_depth": np.arange(9, 15,2),
"max_features":np.arange(15,30,5),'min_samples_leaf': range(26, 32, 2),
'min_samples_split': range(26, 32, 2)}
model_cv_rf = GridSearchCV(estimator = rf_bank1, param_grid = params,
scoring= 'accuracy',
cv=3,
verbose = 1,
return_train_score=True)
model_cv_rf.fit(X_train, y_train)
# results of grid search CV
cv_results_rf = pd.DataFrame(model_cv_rf.cv_results_)
#cv_results_rf
#parameters best value
best_score_rf = model_cv_rf.best_score_
best_rf = model_cv_rf.best_params_
best_rf
# Refit with the best parameters found above.
rf_bank_best = RandomForestClassifier(max_depth= 13, max_features= 25,random_state=202,
n_estimators=16,criterion='entropy',
min_samples_leaf=30,min_samples_split=30)
rf_bank_best.fit(X_train, y_train)
# predict
y_pred1_rf = rf_bank_best.predict(X_test)
# metrics
# confusion matrix
print("confusion matrix: \n", metrics.confusion_matrix(y_test, y_pred1_rf))
# accuracy
print("accuracy: ", metrics.accuracy_score(y_test, y_pred1_rf))
# precision
print("precision: ", metrics.precision_score(y_test, y_pred1_rf))
# recall/sensitivity
print("recall: ", metrics.recall_score(y_test, y_pred1_rf))
#Area under curve
print("area-under-curve metric: ", metrics.roc_auc_score(y_test, y_pred1_rf))
#ROC
rf_roc_auc = roc_auc_score(y_test, rf_bank_best.predict(X_test))
fpr, tpr, thresholds = roc_curve(y_test, rf_bank_best.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Random Forest (area = %0.2f)' % rf_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('ROC_RandomForest')
plt.show()
#Feature Importance
rf_imp_feature=pd.DataFrame(rf_bank_best.feature_importances_, columns = ["Imp"], index = X_train.columns)
rf_imp_feature.sort_values(by="Imp",ascending=False)[:15]
#AdaBoost Algo
from sklearn.ensemble import AdaBoostClassifier
# base estimator
# NOTE: this rebinds `tree`, shadowing the sklearn `tree` module imported earlier.
tree = DecisionTreeClassifier(max_depth=2)
# adaboost with the tree as base estimator
ada_bank = AdaBoostClassifier(
base_estimator=tree,
algorithm="SAMME",random_state=250)
ada_bank.fit(X_train, y_train)
y_pred_ada = ada_bank.predict(X_test)
# Train vs test accuracy.
print(ada_bank.score(X_train, y_train))
print(ada_bank.score(X_test , y_test))
#confusion matrix
confusion_matrix = metrics.confusion_matrix(y_test, y_pred_ada)
print("confusion matrix: \n",confusion_matrix)
# accuracy
print("accuracy", metrics.accuracy_score(y_test, y_pred_ada))
# precision
print("precision", metrics.precision_score(y_test, y_pred_ada))
# recall/sensitivity
print("recall", metrics.recall_score(y_test, y_pred_ada))
#Area under curve
print("area-under-curve metric: ", metrics.roc_auc_score(y_test, y_pred_ada))
#Regularization using GridSearchCV
ada_bank1 = AdaBoostClassifier(base_estimator=tree,algorithm="SAMME",random_state=251)
# parameter grid
# base_estimator__max_depth tunes the depth of the underlying tree.
params = {"base_estimator__max_depth" : np.arange(2, 8,2),"n_estimators": [150,200,250],
"learning_rate":[0.2,0.3,0.4]}
model_cv_ada = GridSearchCV(estimator = ada_bank1, param_grid = params,
scoring= 'accuracy',
cv=3,
verbose = 1,
return_train_score=True)
model_cv_ada.fit(X_train, y_train)
# results of grid search CV
cv_results_ada = pd.DataFrame(model_cv_ada.cv_results_)
#cv_results_ada
#parameters best value
best_score_ada = model_cv_ada.best_score_
best_ada = model_cv_ada.best_params_
best_ada
# base estimator
tree = DecisionTreeClassifier(max_depth=4)
# NOTE(review): the search above fixed algorithm="SAMME", but this final model
# omits it and therefore uses the library default (SAMME.R) — confirm intended.
ada_bank_best = AdaBoostClassifier(base_estimator=tree , n_estimators=200,
random_state=252,learning_rate=0.3)
ada_bank_best.fit(X_train, y_train)
# predict
y_pred1_ada = ada_bank_best.predict(X_test)
# metrics
# confusion matrix
print("confusion matrix: \n", metrics.confusion_matrix(y_test, y_pred1_ada))
# accuracy
print("accuracy: ", metrics.accuracy_score(y_test, y_pred1_ada))
# precision
print("precision: ", metrics.precision_score(y_test, y_pred1_ada))
# recall/sensitivity
print("recall: ", metrics.recall_score(y_test, y_pred1_ada))
#Area under curve
print("area-under-curve metric: ", metrics.roc_auc_score(y_test, y_pred1_ada))
#ROC
ada_roc_auc = roc_auc_score(y_test, ada_bank_best.predict(X_test))
fpr, tpr, thresholds = roc_curve(y_test, ada_bank_best.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Ada Boost (area = %0.2f)' % ada_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('ROC_AdaBoost')
plt.show()
#Feature Importance
ada_imp_feature=pd.DataFrame(ada_bank_best.feature_importances_, columns = ["Imp"], index = X_train.columns)
ada_imp_feature.sort_values(by="Imp",ascending=False)[:15]
# Re-imports (already imported above; harmless in notebook flow).
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
# Baseline gradient boosting model.
gbc_bank = GradientBoostingClassifier(random_state=300)
gbc_bank.fit(X_train, y_train)
y_pred_gbc = gbc_bank.predict(X_test)
print(gbc_bank.score(X_train, y_train))
print(gbc_bank.score(X_test , y_test))
#confusion matrix
confusion_matrix = metrics.confusion_matrix(y_test, y_pred_gbc)
print("confusion matrix: \n",confusion_matrix)
# accuracy
print("accuracy", metrics.accuracy_score(y_test, y_pred_gbc))
# precision
print("precision", metrics.precision_score(y_test, y_pred_gbc))
# recall/sensitivity
print("recall", metrics.recall_score(y_test, y_pred_gbc))
#Area under curve
print("area-under-curve metric: ", metrics.roc_auc_score(y_test, y_pred_gbc))
###As this run is taking very long, adjusting 1 parameter in grid search. However use trial and error by passing single
###value in fit
#Regularization using GridSearchCV
gbc_bank = GradientBoostingClassifier(random_state=301)
# Only n_estimators is grid-searched for runtime reasons; the remaining
# parameters were chosen by manual trial and error (see note above).
params = {"n_estimators": [200,210,220]}#,"learning_rate":[0.1,0.2],"max_depth": np.arange(10, 16)}
# "max_features":np.arange(36,50,2),'min_samples_leaf': range(45, 60, 5)}
model_cv_gbc = GridSearchCV(estimator = gbc_bank, param_grid = params,
scoring= 'accuracy',
cv=3,
verbose = 1,
return_train_score=True)
model_cv_gbc.fit(X_train, y_train)
# results of grid search CV
cv_results_gbc = pd.DataFrame(model_cv_gbc.cv_results_)
#cv_results_gbc
#parameters best value
best_score_gbc = model_cv_gbc.best_score_
best_gbc = model_cv_gbc.best_params_
best_gbc
#After fitting best parameters
gbc_bank_best = GradientBoostingClassifier(learning_rate= 0.1, n_estimators= 220,max_depth= 14,
max_features= 42,random_state=103,min_samples_leaf=50,min_samples_split=50)
gbc_bank_best.fit(X_train, y_train)
# predict
y_pred1_gbc = gbc_bank_best.predict(X_test)
# metrics
# confusion matrix
print("confusion matrix: \n", metrics.confusion_matrix(y_test, y_pred1_gbc))
# accuracy
print("accuracy: ", metrics.accuracy_score(y_test, y_pred1_gbc))
# precision
print("precision: ", metrics.precision_score(y_test, y_pred1_gbc))
# recall/sensitivity
print("recall: ", metrics.recall_score(y_test, y_pred1_gbc))
#Area under curve
print("area-under-curve metric: ", metrics.roc_auc_score(y_test, y_pred1_gbc))
#ROC
gbc_roc_auc = roc_auc_score(y_test, gbc_bank_best.predict(X_test))
fpr, tpr, thresholds = roc_curve(y_test, gbc_bank_best.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='gbc Boost (area = %0.2f)' % gbc_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('ROC_GradientBoost')
plt.show()
#Feature Importance
gbc_imp_feature=pd.DataFrame(gbc_bank_best.feature_importances_, columns = ["Imp"], index = X_train.columns)
gbc_imp_feature.sort_values(by="Imp",ascending=False)[:15]
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
# XGBoost here is fed plain numpy arrays rather than DataFrames.
train_data = np.array(X_train)
test_data = np.array(X_test)
xgb_bank = XGBClassifier(random_state=400)
xgb_bank.fit(train_data, y_train)
y_pred_xgb = xgb_bank.predict(test_data)
print(xgb_bank.score(train_data, y_train))
print(xgb_bank.score(test_data , y_test))
#confusion matrix
confusion_matrix = metrics.confusion_matrix(y_test, y_pred_xgb)
print("confusion matrix: \n",confusion_matrix)
# accuracy
print("accuracy", metrics.accuracy_score(y_test, y_pred_xgb))
# precision
print("precision", metrics.precision_score(y_test, y_pred_xgb))
# recall/sensitivity
print("recall", metrics.recall_score(y_test, y_pred_xgb))
#Area under curve
print("area-under-curve metric: ", metrics.roc_auc_score(y_test, y_pred_xgb))
#Regularization using GridSearchCV - 1st Iteration
# First pass: tune sampling fractions, learning rate and tree count.
params1 = {
"colsample_bytree": [i/100.0 for i in range(78,82,2)],
"learning_rate": [0.2,0.3],
"n_estimators": [142,144,146],
"subsample": [i/100.0 for i in range(80,84,2)]
}
model_cv_xgb1 = GridSearchCV(estimator = xgb_bank, param_grid = params1,
scoring= 'accuracy',
cv=3,
verbose = 1,
return_train_score=True)
model_cv_xgb1.fit(train_data,y_train)
# results of grid search CV
cv_results_xgb1 = pd.DataFrame(model_cv_xgb1.cv_results_)
cv_results_xgb1
#parameters best value
best_score_xgb1 = model_cv_xgb1.best_score_
best_xgb1 = model_cv_xgb1.best_params_
best_xgb1
#Choosing best parameter from 1st Iteration
xgb_bank_best1 = XGBClassifier(colsample_bytree=0.8,learning_rate=0.2,n_estimators=144,subsample=0.82)
xgb_bank_best1.fit(train_data, y_train)
# predict
y_pred_xgb1 = xgb_bank_best1.predict(test_data)
#confusion matrix
confusion_matrix = metrics.confusion_matrix(y_test, y_pred_xgb1)
print("confusion matrix: \n",confusion_matrix)
# accuracy
print("accuracy", metrics.accuracy_score(y_test, y_pred_xgb1))
# precision
print("precision", metrics.precision_score(y_test, y_pred_xgb1))
# recall/sensitivity
print("recall", metrics.recall_score(y_test, y_pred_xgb1))
#Area under curve
print("area-under-curve metric: ", metrics.roc_auc_score(y_test, y_pred_xgb1))
#Regularization using GridSearchCV - 2nd Iteration
#Second pass: tune tree complexity on top of the iteration-1 winner.
params2 = {
'min_child_weight':[4,5,6,7],"max_depth": [2,4,6],
}
model_cv_xgb2 = GridSearchCV(estimator = xgb_bank_best1, param_grid = params2,
scoring= 'accuracy',
cv=3,
verbose = 1,
return_train_score=True)
#Choosing best parameter obtained from 2nd Iteration an apply to model of 1st iteration
model_cv_xgb2.fit(train_data,y_train)
# results of grid search CV
cv_results_xgb2 = pd.DataFrame(model_cv_xgb2.cv_results_)
cv_results_xgb2
#parameters best value
best_score_xgb2 = model_cv_xgb2.best_score_
best_xgb2 = model_cv_xgb2.best_params_
best_xgb2
xgb_bank_best2 = XGBClassifier(colsample_bytree=0.8,learning_rate=0.2,n_estimators=144,subsample=0.82,
min_child_weight=6,max_depth=4)
xgb_bank_best2.fit(train_data, y_train)
# predict
#BUGFIX: the original predicted with xgb_bank_best1 here, so every iteration-2
#metric below actually re-evaluated the iteration-1 model.
y_pred_xgb2 = xgb_bank_best2.predict(test_data)
#confusion matrix
confusion_matrix = metrics.confusion_matrix(y_test, y_pred_xgb2)
print("confusion matrix: \n",confusion_matrix)
# accuracy
print("accuracy", metrics.accuracy_score(y_test, y_pred_xgb2))
# precision
print("precision", metrics.precision_score(y_test, y_pred_xgb2))
# recall/sensitivity
print("recall", metrics.recall_score(y_test, y_pred_xgb2))
#Area under curve
print("area-under-curve metric: ", metrics.roc_auc_score(y_test, y_pred_xgb2))
#Regularization using GridSearchCV - 3rd Iteration
# Third pass: tune the minimum split-loss regularizer gamma.
params3 = {
'gamma':[0.3,0.35,0.4,0.45]
}
model_cv_xgb3 = GridSearchCV(estimator = xgb_bank_best2, param_grid = params3,
scoring= 'accuracy',
cv=3,
verbose = 1,
return_train_score=True)
model_cv_xgb3.fit(train_data,y_train)
# results of grid search CV
cv_results_xgb3 = pd.DataFrame(model_cv_xgb3.cv_results_)
#parameters best value
best_score_xgb3 = model_cv_xgb3.best_score_
best_xgb3 = model_cv_xgb3.best_params_
best_xgb3
# Refit with gamma added to the accumulated best parameters.
xgb_bank_best3 = XGBClassifier(colsample_bytree=0.8,learning_rate=0.2,n_estimators=144,subsample=0.82,
min_child_weight=6,max_depth=4,gamma=0.4)
xgb_bank_best3.fit(train_data, y_train)
# predict
y_pred_xgb3 = xgb_bank_best3.predict(test_data)
#confusion matrix
confusion_matrix = metrics.confusion_matrix(y_test, y_pred_xgb3)
print("confusion matrix: \n",confusion_matrix)
# accuracy
print("accuracy", metrics.accuracy_score(y_test, y_pred_xgb3))
# precision
print("precision", metrics.precision_score(y_test, y_pred_xgb3))
# recall/sensitivity
print("recall", metrics.recall_score(y_test, y_pred_xgb3))
#Area under curve
print("area-under-curve metric: ", metrics.roc_auc_score(y_test, y_pred_xgb3))
#Regularization using GridSearchCV - 4th Iteration
#Fourth pass: tune the L2 regularizer reg_lambda.
params4 = {
'reg_lambda':[1e-2,0.05,0.1]
}
model_cv_xgb4 = GridSearchCV(estimator = xgb_bank_best3, param_grid = params4,
scoring= 'accuracy',
cv=3,
verbose = 1,
return_train_score=True)
model_cv_xgb4.fit(train_data,y_train)
# results of grid search CV
cv_results_xgb4 = pd.DataFrame(model_cv_xgb4.cv_results_)
#parameters best value
best_score_xgb4 = model_cv_xgb4.best_score_
best_xgb4 = model_cv_xgb4.best_params_
best_xgb4
# Final model with all tuned parameters accumulated across the four passes.
xgb_bank_best4 = XGBClassifier(colsample_bytree=0.8,learning_rate=0.2,n_estimators=144,subsample=0.82,
min_child_weight=6,max_depth=4,gamma=0.4,reg_lambda=0.05)
xgb_bank_best4.fit(train_data, y_train)
# predict
y_pred_xgb4 = xgb_bank_best4.predict(test_data)
#confusion matrix
confusion_matrix = metrics.confusion_matrix(y_test, y_pred_xgb4)
print("confusion matrix: \n",confusion_matrix)
# accuracy
print("accuracy", metrics.accuracy_score(y_test, y_pred_xgb4))
# precision
print("precision", metrics.precision_score(y_test, y_pred_xgb4))
# recall/sensitivity
print("recall", metrics.recall_score(y_test, y_pred_xgb4))
#Area under curve
print("area-under-curve metric: ", metrics.roc_auc_score(y_test, y_pred_xgb4))
#ROC
Xgbc_roc_auc = roc_auc_score(y_test, y_pred_xgb4)
fpr, tpr, thresholds = roc_curve(y_test, xgb_bank_best4.predict_proba(test_data)[:,1])
plt.figure()
#Legend label fixed: the original mislabelled this XGBoost curve as "gbc Boost".
plt.plot(fpr, tpr, label='XGBoost (area = %0.2f)' % Xgbc_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('ROC_XBBoost')
plt.show()
#Feature Importance
#NOTE(review): the name gbc_imp_feature is reused from the GradientBoosting
#section; kept as-is in case later cells reference it, but it now holds the
#XGBoost importances.
gbc_imp_feature=pd.DataFrame(xgb_bank_best4.feature_importances_, columns = ["Imp"], index = X_train.columns)
gbc_imp_feature.sort_values(by="Imp",ascending=False)[:16]